home *** CD-ROM | disk | FTP | other *** search
- /*
- *
- * Author: Allen I. Holub
- *
- * (c) C Gazette. May be used freely as long as author and publication are
- * acknowledged
- *
- * Roy S. Woll (Revision 2.0)
- * 1032 Summerplace Dr.
- * San Jose, CA 95122
- *
- * ----------------------------------------------------------------------
- *
- *
- * Revision 2.02 14 Mar 1993 ROY S. WOLL
- *
- * Fixed octal set definition to include first octel digit
- *
- * Revision 2.0 16 Nov 1992 ROY S. WOLL
- *
- * Initial revision for match.c -> match.cpp
- * Compatibility with C++ syntax, and now member functions of regX.h
- * to avoid polluting the name space.
- * Fixed some inconsistencies. Regular expression compiled pattern should
- * now grow as needed. Case insensitive support.
- *
- * Revision 1 27 Jan 1991 Allen I. Holub
- *
- */
-
- #include <stdio.h>
- #include <ctype.h>
- #include <string.h>
-
- #include "regximp.h"
-
- inline const char * max(const char * x, const char * y)
- {if (x>y) return x; else return y;}
-
- /* Metacharacters in the input: */
- #define BOL '^' /* start-of-line anchor */
- #define EOL '$' /* end-of-line anchor */
- #define ANY '.' /* matches any character */
- #define CCL '[' /* start a character class */
- #define CCLEND ']' /* end a character class */
- #define NCCL '^' /* negates character class if 1st char. */
- #define CLOSURE '*' /* Kleene closure (matches 0 or more) */
- #define PCLOSE '+' /* Positive closure (1 or more) */
- #define OPT '?' /* Optional closure (0 or 1) */
-
- typedef enum action { // These are put in the pattern string
- // to represent metacharacters.
- M_BOL = (0x80 | '^'),
- M_EOL = (0x80 | '$'),
- M_ANY = (0x80 | '.'),
- M_CCL = (0x80 | '['),
- M_OPT = (0x80 | '?'),
- M_CLOSE = (0x80 | '*'),
- M_PCLOSE = (0x80 | '+')
- } action;
-
-
-
- typedef unsigned char pattern; /* pattern strings are unsigned char */
-
- #define IS_ACTION(x) ((x)&0x80) /* true => element of pat. string is an */
- /* action that represents a metacharacter */
-
- /*----------------------------------------------------------------------*/
- #define MAPSIZE 16 /* need this many bytes for character class bit map */
-
- /*
- * Advance a pointer into the pattern template
- * to the next pattern element, this is a +1 for
- * all pattern elements but M_CCL, where you
- * to skip past both the M_CCL character and the
- * bitmap that follows that character
- */
-
- #define ADVANCE(pat) (pat += (*pat == (pattern)M_CCL) ? (MAPSIZE+1) : 1)
-
- //
- // Bitmap functions. Set bit b in the map and
- // test bit b to see if it was set previously.
- //
- #define SETBIT(b,map) ((map)[((b) & 0x7f) >>3] |= (1<< ((b) & 0x07)) )
- #define TSTBIT(b,map) ((map)[((b) & 0x7f) >>3] & (1<< ((b) & 0x07)) )
-
- int regXimp::omatch(const char ** strp, const pattern * pat,
- const char * start)
- {
- /*
- * Match one pattern element, pointed at by pat, against the character at
- * **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
- * over the matched character on a successful match. Closure is handled one
- * level up by patcmp().
- *
- * "start" points at the character at the left edge of the line. This might
- * not be the same thing as *strp if the search is starting in the middle
- * of the string. An end-of- line anchor matches '\n' or '\0'.
- */
-
- int advance = -1; // amount to advance *strp, -1 == error
-
- switch (*pat) {
- case M_BOL: // First char in string?
- if (*strp == start) // Only one star here.
- advance = 0;
- break;
-
- case M_ANY: // . = anything but newline
- if (**strp != '\n') advance = 1;
- break;
-
- case M_EOL:
- if (**strp == '\n' || **strp == '\0')
- advance = 0;
- break;
-
- case M_CCL:
- if (TSTBIT(**strp, pat + 1)) advance = 1;
- break;
-
- default: /* literal match */
- if (caseSensitive){
- if (**strp == *pat) advance = 1;
- }
- else if (toupper(**strp) == toupper(*pat)) advance = 1;
- break;
- }
-
- if (advance > 0)
- *strp += advance;
-
- return (advance + 1);
- }
-
- #define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
-
- static int hex2bin(int c)
- {
- /* Convert the hex digit represented by 'c' to an int. 'c'
- * must be one of: 0123456789abcdefABCDEF
- */
- return (isdigit(c) ? (c)-'0': ((toupper(c))-'A')+10) & 0xf;
- }
-
- static int oct2bin(int c)
- {
- /* Convert the hex digit represented by 'c' to an int. 'c'
- * must be a digit in the range '0'-'7'.
- */
- return ( ((c)-'0') & 0x7 );
- }
-
-
-
- /*------------------------------------------------------------*/
-
- int esc(const char **s)
- {
- /* Map escape sequences into their equivalent symbols. Return
- * the equivalent ASCII character. *s is advanced past the
- * escape sequence. If no escape sequence is present, the
- * current character is returned and the string is advanced by
- * one. The following are recognized:
- *
- * \b backspace
- * \f formfeed
- * \n newline
- * \r carriage return
- * \s space
- * \t tab
- * \e ASCII ESC character ('\033')
- * \DDD number formed of 1-3 octal digits
- * \xDDD number formed of 1-3 hex digits
- * \^C C = any letter. Control code
- */
-
- int rval;
-
- if( **s != '\\' )
- rval = *( (*s)++ );
- else {
- ++(*s); // Skip the '\'
- switch( toupper(**s) ) {
- case '\0': rval = '\\'; break;
- case 'B': rval = '\b' ; break;
- case 'F': rval = '\f' ; break;
- case 'N': rval = '\n' ; break;
- case 'R': rval = '\r' ; break;
- case 'S': rval = ' ' ; break;
- case 'T': rval = '\t' ; break;
- case 'E': rval = '\033'; break;
-
- case '^':
- rval = *++(*s) ;
- rval = toupper(rval) - '@' ;
- break;
-
- case 'X':
- rval = 0;
- ++(*s);
- if( isxdigit(**s) ) {
- rval = hex2bin( *(*s)++ );
- }
- if( isxdigit(**s) ) {
- rval <<= 4;
- rval |= hex2bin( *(*s)++ );
- }
- if( isxdigit(**s) ) {
- rval <<= 4;
- rval |= hex2bin( *(*s)++ );
- }
- --(*s);
- break;
-
- default:
- if( !ISOCTDIGIT(**s) )
- rval = **s;
- else {
- rval = oct2bin( *(*s)++ );
- if( ISOCTDIGIT(**s) ) {
- rval <<= 3;
- rval |= oct2bin( *(*s)++ );
- }
- if( ISOCTDIGIT(**s) ) {
- rval <<= 3;
- rval |= oct2bin( *(*s)++ );
- }
- --(*s);
- }
- break;
- }
- ++(*s);
- }
- return rval;
- }
-
-
- /*----------------------------------------------------------------------*/
- const char * regXimp::doccl(const char * src)
- {
- /*
- * Set bits in the map corresponding to characters specified in the src
- * character class.
- */
-
-
- int first, last, negative;
-
- ++src; // skip past the [
- negative = (*src == NCCL);
- if (negative) ++src; // check for negative ccl
-
- const char * start = src; // start of characters in class
-
- int len = compiledPattern.length();
- compiledPattern.pad(len+MAPSIZE, str::right, char(0));
- char * map = (char *)compiledPattern(len);
-
- while (*src && *src != CCLEND) {
- if (*src != '-') {
- first = esc(&src); // Use temp. to avoid macro
- // side effects.
- SETBIT(first, map);
-
- } else if (src == start) {
- SETBIT('-', map); // literal dash at start or end
- ++src;
- } else {
- ++src; // skip to end-of-sequence char
-
- // Support special characters within [].
- last = esc(&src);
- if (last < first){
- int temp=first;
- first = last;
- last = temp;
- };
-
- while (++first <= last) SETBIT(first, map);
- }
- }
-
- if (*src == CCLEND) ++src; // Skip CCLEND
-
- if (negative)
- for (first = MAPSIZE; --first >= 0;)
- *map++ ^= ~0; /* invert all bits */
-
- return src;
- }
-
- /*----------------------------------------------------------------------*/
- const char * regXimp::patcmp(const char * sstr, const pattern * pat,
- const char * start)
- {
- /*
- * Like strcmp, but compares str against pat. Each element of str is
- * compared with the template until either a mis-match is found or the end
- * of the template is reached. In the former case a 0 is returned; in the
- * latter, a pointer into str (pointing to the last character in the
- * matched pattern) is returned. Strstart points at the first character in
- * the string, which might not be the same thing as line if the search
- * started in the middle of the string.
- */
-
- const char *bocl; // beginning of closure string.
- const char *end=0; // return value: end-of-string pointer.
-
- if (!pat) return (NULL); // make sure pattern is valid
-
- while (*pat) {
-
- if (*pat == (pattern)M_OPT) {
- /*
- * Zero or one matches. It doesn't matter if omatch fails---it will
- * advance str past the character on success, though. Always advance
- * the pattern past both the M_OPT and the operand.
- */
-
- omatch(&sstr, ++pat, start);
-
- ADVANCE(pat);
- } else if (!(*pat == (pattern)M_CLOSE || *pat == (pattern)M_PCLOSE)) {
-
- //
- // Do a simple match. Note that omatch() fails if there's still
- // something in pat but we're at end of string.
- //
- if (!omatch(&sstr, pat, start)) return NULL;
-
- ADVANCE(pat);
-
- } else { // Process a Kleene or positive closure
-
- if (*pat++ == (pattern)M_PCLOSE) // one match required
- if (!omatch(&sstr, pat, start)) return NULL;
-
- // Match as many as possible, zero is okay
- bocl = sstr;
- while (*sstr && omatch(&sstr, pat, start));
-
- /*
- * 'str' now points to the character that made made us fail. Try to
- * process the rest of the string. If the character following the
- * closure could have been in the closure (as in the pattern "[a-z]*t")
- * the final 't' will be sucked up in the while loop. So, if the match
- * fails, back up a notch and try to match the rest of the string
- * again, repeating this process recursively until we get back to the
- * beginning of the closure. The recursion goes, at most, one levels
- * deep.
- */
-
-
- if (*ADVANCE(pat)) {
- do {
- end = patcmp(sstr, pat, start);
- if (end) break;
- sstr--;
-
- } while (bocl <= sstr);
- return end;
- }
-
- break;
- };
-
- }
-
- /*
- * omatch() advances str to point at the next character to be matched. So
- * str points at the character following the last character matched when
- * you reach the end of the template. The exceptions are templates
- * containing only a BOLN or EOLN token. In these cases omatch doesn't
- * advance. Since we must return a pointer to the last matched character,
- * decrement str to make it point at the end of the matched string, making
- * sure that the decrement hasn't gone past the beginning of the string.
- *
- * Note that $ is a position, not a character, but in the case of a pattern
- * ^$, a pointer to the end of line character is returned. In ^xyz$, a
- * pointer to the z is returned.
- *
- * The --str is done outside the return statement because max() is a macro
- * with side-effects.
- */
-
- --sstr;
-
- return (max(start, sstr));
- }
-
-
- /*----------------------------------------------------------------------*/
- int regXimp::makepat(const char * exp)
- {
- /*
- * Make a pattern template from the string pointed to by exp.
- * The pattern template is assembled in str compiledPattern.
- * Regular expression can contain one or more \n characters.
- *
- * Return 1 if success
- */
-
- error = 1;
-
- if (!*exp) goto exit;
-
- if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT) goto exit;
-
- int prev, len;
-
- len=0;
-
- while (*exp ) {
-
- switch (*exp) {
- case ANY:
- prev=len++;
- compiledPattern += (pattern)M_ANY;
- ++exp;
- break;
-
- case BOL:
- prev=len++;
- compiledPattern += (!compiledPattern.length()) ? (pattern)M_BOL : *exp;
- ++exp;
- break;
-
- case EOL:
- prev=len++;
- compiledPattern += (!exp[1] || exp[1] == '\n') ? (pattern)M_EOL : *exp;
- ++exp;
- break;
-
- case CCL:
- prev=len++;
- compiledPattern += (pattern)M_CCL;
- exp = doccl(exp);
- len += MAPSIZE;
- break;
-
- case OPT:
- case CLOSURE:
- case PCLOSE:
- switch (compiledPattern[prev]) {
- case M_BOL:
- case M_EOL:
- case M_OPT:
- case M_PCLOSE:
- case M_CLOSE:
- goto exit;
- }
-
- compiledPattern.insert(prev,
- (*exp == OPT) ? (pattern)M_OPT :
- (*exp == PCLOSE) ? (pattern)M_PCLOSE : (pattern)M_CLOSE);
-
- len++;
- ++exp;
- break;
-
- default:
- prev=len++;
- compiledPattern += esc(&exp);
- break;
- }
- }
-
- error = 0;
-
- exit:
- return (!error);
- }
-
- /*----------------------------------------------------------------------*/
- const char * regXimp::matchs(const char * sstr, int p_case)
- {
- const char *endp = NULL;
- const char *start;
- const unsigned char * pat = (unsigned char *)compiledPattern();
-
- caseSensitive = p_case;
-
- if (!pat) return NULL;
-
- if (*sstr == '\0') {
- if ((*pat == (pattern)M_EOL) ||
- (*pat == (pattern)M_BOL && (!pat[1] || pat[1] == (pattern)M_EOL)))
- {
- endp = sstr;
- endMatch = startMatch = sstr;
- }
- }
- else {
- start = sstr; // Do a brute-force substring search,
- // comparing a pattern against the input string
-
- while (*sstr) {
- endp = patcmp(sstr, pat, start);
-
- if (endp) {
- endMatch = endp;
- startMatch = sstr;
-
- break;
- };
- sstr++;
- }
- }
- return endp;
- }
-
-